# makePredictorMatrix.R

makePredictorMatrix <- function (depvarName, reqd.vars, cand.vars, ID = NULL) {
  # Build the data frame of predictors used to impute "depvarName".
  #
  # reqd.vars and cand.vars are combined column-wise.  The combined data frame
  # is then limited to cases corresponding to subjects who were assigned to be
  # asked the "depvarName" question, i.e., cases whose value is nonzero or NA.
  # This limitation excludes (a) years in which the question was asked of no
  # one, and (b) subjects who were not assigned to be asked the question
  # because of split-ballot sampling.  Finally, candidate predictors are kept
  # only if they are likely to help with imputation: they must be nonmissing
  # for more than half of the cases and must have R^2 >= .005 in a bivariate
  # regression of the dependent variable on the predictor.
  #
  # Args:
  #   depvarName: string; name of the dependent variable.  Must be a column
  #               of reqd.vars.
  #   reqd.vars:  data frame of variables that are always kept.
  #   cand.vars:  data frame of candidate predictors, with the same rows as
  #               reqd.vars.
  #   ID:         NULL, or the name of a respondent-ID column.  That column is
  #               never regression-tested; it passes the R^2 screen
  #               automatically.
  #
  # Returns:
  #   A data frame containing every column of reqd.vars plus the surviving
  #   candidate columns, restricted to the relevant cases, with rows that are
  #   entirely NA removed.  Warnings are issued if the result has no rows, no
  #   columns, or no NA values in the dependent variable.

  if (!is.character(depvarName)) {
    stop('depvarName must be a string')
  }
  # && (scalar, short-circuiting) is the correct operator inside if().
  if (!is.null(ID) && !is.character(ID)) {
    stop("ID must be NULL or a string.")
  }
  if (!is.data.frame(reqd.vars)) {
    stop('reqd.vars must be a data frame')
  }
  if (!is.data.frame(cand.vars)) {
    stop('cand.vars must be a data frame')
  }
  if (! depvarName %in% colnames(reqd.vars)) {
    stop("A variable called \"", depvarName, "\" must be in reqd.vars.")
  }

  # The required variables occupy the first n.reqd columns of imput.df
  # throughout, because no filter below is ever allowed to drop them.
  n.reqd   <- ncol(reqd.vars)
  reqd.idx <- seq_len(n.reqd)

  # Restrict data frame to cases in which people were actually assigned to
  # be asked the question.  drop = FALSE keeps a data frame even if only one
  # column is present.
  depvar   <- reqd.vars[[depvarName]]
  asked    <- depvar != 0 | is.na(depvar)
  imput.df <- data.frame(reqd.vars, cand.vars)[asked, , drop = FALSE]

  # Restrict data frame to predictors that are missing for fewer than half
  # of cases.  The proportion is NaN when there are no cases at all; treat
  # that as "do not keep" rather than letting NA reach the column index.
  prop.missing <- vapply(
    imput.df,
    function (x) round(sum(is.na(x)) / length(x), 2),
    numeric(1)
  )
  keep           <- !is.na(prop.missing) & prop.missing < .5
  keep[reqd.idx] <- TRUE  # These are the variables that I use in my IV analysis
  imput.df       <- imput.df[, keep, drop = FALSE]

  # Restrict the set of predictors to those with R^2 >= .005.  Only the
  # candidate columns are tested: the required columns are kept regardless,
  # so regressing on them would be wasted work (and could fail needlessly).
  rsq   <- rep(NA_real_, ncol(imput.df))
  yvals <- imput.df[[depvarName]]
  for (i in seq_along(rsq)[-reqd.idx]) {

    # Don't test the respondent ID column.  [2013 02 25]
    if (!is.null(ID) && colnames(imput.df)[i] %in% ID) {
      rsq[i] <- 1
      next
    }

    # Skip the regression if the predictor is the same as the dependent variable.
    if (depvarName == colnames(imput.df)[i]) { next }

    # Run the regression and record the R^2.
    xvals  <- imput.df[[i]]
    rsq[i] <- summary(lm(yvals ~ xvals))$r.squared
  }
  # An NA/NaN R^2 (e.g., from a degenerate regression) counts as failing the
  # screen instead of poisoning the logical column index.
  strong           <- !is.na(rsq) & rsq >= .005
  strong[reqd.idx] <- TRUE  # These are the variables that I use in my IV analysis
  imput.df         <- imput.df[, strong, drop = FALSE]

  # Omit cases in which all values are NA.  amelia() does this automatically;
  # I do it here to avoid the warning message that it gives if it omits these
  # cases.
  allNARows <- which(rowSums(!is.na(imput.df)) == 0)
  if (length(allNARows) > 0) {
    imput.df <- imput.df[-allNARows, , drop = FALSE]
  }

  # Error-checking before return
  if (nrow(imput.df) == 0) {
    warning("imput.df has 0 rows")
  }
  if (ncol(imput.df) == 0) {
    warning("imput.df has 0 columns")
  }
  if (!anyNA(imput.df[[depvarName]])) {
    warning("No NA values in imput.df$", depvarName, ", so there is nothing to impute for this variable.")
  }

  # Return
  return(imput.df)
}





